PATH = /Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/R_figures

setwd(“~/Box Sync/CODE_DATA/dir_MAPpaper/R_figures”)

setwd(“~/Box Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX”)

merge files created with - Pmap_Rcombine.py

projects

  1. detected vs other
  2. groups - reactive & panTarget - hg19 vs hg38
  3. specific changes “CADDspecific”

combined all cys and lys detected pos specific rows with non detected version to do DETECTED VS NOT comparison below:

# cys merge of all detected and non detected rows
cmerg <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_Cdet_not_merge.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(group = factor(group, levels = c(
    "Detected",
    "Other"
  )))

# lys merge 
kmerg <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_Kdet_not_merge.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(group = factor(group, levels = c(
    "Detected",
    "Other"
  )))
summary(cmerg)
##          index          CADD_avg           CADD_max      assembly    
##  A0AVT1_C156:    2   Min.   : 0.03643   Min.   : 0.116   hg19:41080  
##  A0AVT1_C17 :    2   1st Qu.:23.15714   1st Qu.:24.800   hg38:41080  
##  A0AVT1_C174:    2   Median :25.84286   Median :28.200               
##  A0AVT1_C178:    2   Mean   :24.90547   Mean   :27.701               
##  A0AVT1_C197:    2   3rd Qu.:28.14286   3rd Qu.:32.000               
##  A0AVT1_C231:    2   Max.   :35.00000   Max.   :59.000               
##  (Other)    :82148                                                   
##       group      
##  Detected:12300  
##  Other   :69860  
##                  
##                  
##                  
##                  
## 
summary(kmerg)
##           index           CADD_avg           CADD_max      assembly     
##  A0AVT1_K1003:     2   Min.   : 0.01914   Min.   : 0.059   hg19:154029  
##  A0AVT1_K1006:     2   1st Qu.:22.64286   1st Qu.:24.300   hg38:154029  
##  A0AVT1_K1011:     2   Median :24.51429   Median :27.000                
##  A0AVT1_K1014:     2   Mean   :24.11317   Mean   :26.966                
##  A0AVT1_K1019:     2   3rd Qu.:26.54286   3rd Qu.:29.700                
##  A0AVT1_K1020:     2   Max.   :36.28571   Max.   :57.000                
##  (Other)     :308046                                                    
##       group       
##  Detected: 18004  
##  Other   :290054  
##                   
##                   
##                   
##                   
## 

Cys det vs other

Cm <- cmerg %>% 
  ggplot(aes(x=assembly, y=CADD_avg, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Average missense CADD scores for detected vs non-detected Cysteines") +
  ylab("CADD score") +
  xlab("Assembly groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))

Cm

ggsave("Cmerge_det_vs_not_violin.png", width = 7, height=4, dpi= 300)

cys max CADD version

Cmax <- cmerg %>% 
  ggplot(aes(x=assembly, y=CADD_max,fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Max missense CADD scores for detected vs not detected Cysteines") +
  ylab("CADD score") +
  xlab("Assembly groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60))

Cmax

ggsave("CmergeMAX_det_vs_not_violin.png", width = 7, height=4, dpi= 300)

Lys det vs other

Km <- kmerg %>% 
  ggplot(aes(x=assembly, y=CADD_avg,  fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Average missense CADD scores for detected vs not detected Lysines") +
  ylab("CADD score") +
  xlab("Assembly groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))

Km

ggsave("Kmerge_det_vs_not_violin.png", width = 7, height=4, dpi= 300)

Lys max cadd version

Kmax <- kmerg %>% 
  ggplot(aes(x=assembly, y=CADD_max, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Max missense CADD scores for detected vs not detected Lysines") +
  ylab("CADD score") +
  xlab("Assembly groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60))

Kmax

ggsave("KmergeMAX_det_vs_not_violin.png", width = 7, height=4, dpi= 300)

CADD hg19 vs hg38 and detected ( reactivity / panTarget ) data

postion specific files, averages and max CADD scores at postions

  • POSspecific_C_detected_maxmean_6150x2pos.csv
  • POSspecific_C_notdetected_maxmean_34930x2pos.csv
  • POSspecific_K_detected_maxmean_9002x2pos.csv
  • POSspecific_K_notdetected_maxmean_145027x2pos.csv
Cdet <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_C_detected_maxmean_6150x2pos.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(Cys_react_threshold = factor(Cys_react_threshold, levels = c(
    "Low",
    "Medium",
    "High"
  ))) %>% 
  mutate(Cys_target_label = factor(Cys_target_label, levels = c(
    "Target",
    "panReactive"
  )))


Cnot <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_C_notdetected_maxmean_34930x2pos.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  )))


Kdet <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_K_detected_maxmean_9002x2pos.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(Lys_react_threshold = factor(Lys_react_threshold, levels = c(
    "Low",
    "Medium",
    "High"
  ))) %>% 
  mutate(Lys_target_label = factor(Lys_target_label, levels = c(
    "Target",
    "panReactive"
  )))

Knot <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/POSspecific_K_notdetected_maxmean_145027x2pos.csv") %>% 
  mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  )))
summary(Cdet)
##          index            xref       Cys_reactivity   Cys_react_threshold
##  A0AVT1_C347:    2   P21333 :   50   Min.   : 0.740   Low   :1858        
##  A0AVT1_C433:    2   P49327 :   50   1st Qu.: 4.320   Medium: 882        
##  A0AVT1_C546:    2   Q15149 :   44   Median : 5.575   High  : 188        
##  A0AVT1_C625:    2   P78527 :   40   Mean   : 5.927   NA's  :9372        
##  A0AVT1_C721:    2   Q7Z6Z7 :   40   3rd Qu.: 6.830                      
##  A0AVT1_C770:    2   O75369 :   36   Max.   :20.000                      
##  (Other)    :12288   (Other):12040   NA's   :9372                        
##     Cys_target_label geneNamePrimary    CADD_avg           CADD_max     
##  Target     : 1424   FASN   :   50   Min.   : 0.06386   Min.   : 0.396  
##  panReactive:10250   FLNA   :   50   1st Qu.:22.05679   1st Qu.:23.800  
##  NA's       :  626   PLEC   :   44   Median :24.80000   Median :27.100  
##                      HUWE1  :   40   Mean   :23.91074   Mean   :26.942  
##                      PRKDC  :   40   3rd Qu.:27.60000   3rd Qu.:31.000  
##                      DYNC1H1:   36   Max.   :34.71429   Max.   :59.000  
##                      (Other):12040                                      
##  assembly   
##  hg19:6150  
##  hg38:6150  
##             
##             
##             
##             
## 
summary(Cnot)
##          index            xref       geneNamePrimary    CADD_avg       
##  A0AVT1_C156:    2   Q8WZ42 : 1024   TTN    : 1024   Min.   : 0.03643  
##  A0AVT1_C17 :    2   Q04721 :  484   NOTCH2 :  484   1st Qu.:23.38571  
##  A0AVT1_C174:    2   P24043 :  326   LAMA2  :  326   Median :26.00000  
##  A0AVT1_C178:    2   Q15751 :  238   HERC1  :  238   Mean   :25.08060  
##  A0AVT1_C197:    2   Q5T4S7 :  238   UBR4   :  238   3rd Qu.:28.21429  
##  A0AVT1_C231:    2   Q70CQ2 :  198          :  206   Max.   :35.00000  
##  (Other)    :69848   (Other):67352   (Other):67344                     
##     CADD_max      assembly    
##  Min.   : 0.116   hg19:34930  
##  1st Qu.:25.100   hg38:34930  
##  Median :28.300               
##  Mean   :27.835               
##  3rd Qu.:32.000               
##  Max.   :53.000               
## 
summary(Kdet)
##           index            xref       Lys_reactivity   Lys_react_threshold
##  A0AVT1_K1014:    2   Q09666 :  210   Min.   : 0.100   Low   :6974        
##  A0AVT1_K409 :    2   P21333 :  100   1st Qu.: 5.800   Medium:1322        
##  A0AVT1_K544 :    2   O75369 :   94   Median :10.000   High  : 608        
##  A0AVT1_K86  :    2   P35579 :   84   Mean   : 7.923   NA's  :9100        
##  A0AVT1_K963 :    2   Q15149 :   72   3rd Qu.:10.000                      
##  A0JNW5_K645 :    2   P00558 :   60   Max.   :10.000                      
##  (Other)     :17992   (Other):17384   NA's   :9100                        
##     Lys_target_label geneNamePrimary    CADD_avg           CADD_max     
##  Target     :  230   AHNAK  :  210   Min.   : 0.01914   Min.   : 0.059  
##  panReactive:15924   FLNA   :  100   1st Qu.:23.48571   1st Qu.:25.600  
##  NA's       : 1850   FLNB   :   94   Median :25.34286   Median :28.400  
##                      MYH9   :   84   Mean   :25.23376   Mean   :28.324  
##                      PLEC   :   72   3rd Qu.:27.27143   3rd Qu.:32.000  
##                      PGK1   :   60   Max.   :34.71429   Max.   :40.000  
##                      (Other):17384                                      
##  assembly   
##  hg19:9002  
##  hg38:9002  
##             
##             
##             
##             
## 
summary(Knot)
##           index             xref        geneNamePrimary 
##  A0AVT1_K1003:     2   Q8WZ42 :  5886   TTN    :  5886  
##  A0AVT1_K1006:     2   P20929 :  1656   NEB    :  1656  
##  A0AVT1_K1011:     2   Q09666 :  1444   AHNAK  :  1444  
##  A0AVT1_K1019:     2   Q8WXH0 :  1188   SYNE2  :  1188  
##  A0AVT1_K1020:     2   Q9UPN3 :  1080   MACF1  :  1080  
##  A0AVT1_K115 :     2   P46013 :   748   MKI67  :   748  
##  (Other)     :290042   (Other):278052   (Other):278052  
##     CADD_avg           CADD_max     assembly     
##  Min.   : 0.02229   Min.   : 0.07   hg19:145027  
##  1st Qu.:22.60000   1st Qu.:24.30   hg38:145027  
##  Median :24.45714   Median :26.90                
##  Mean   :24.04361   Mean   :26.88                
##  3rd Qu.:26.48571   3rd Qu.:29.60                
##  Max.   :36.28571   Max.   :57.00                
## 

GROUP SPECIFIC VIOLIN PLOTS - removed – theme(legend.position=‘none’)

Credblue <- Cdet %>% 
  filter(Cys_react_threshold == "Low" | Cys_react_threshold == "Medium" | Cys_react_threshold == "High") %>% 
  ggplot(aes(x=Cys_react_threshold, y=CADD_avg, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Average missense CADD score for detected Cysteine codons") +
  ylab("CADD score") +
  xlab("Reactivity groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))

Credblue

ggsave("Creact_avg_violin.png", width = 7, height=4, dpi= 300)
Cpan <- Cdet %>% 
  filter(Cys_target_label == "Target" | Cys_target_label == "panReactive") %>% 
  ggplot(aes(x=Cys_target_label, y=CADD_avg,  fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Average missense CADD score for detected Cysteine codons") +
  ylab("CADD score") +
  xlab("Target groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))

Cpan

ggsave("Cpan_avg_violin.png", width = 7, height=4, dpi= 300)

LYSINE

Kredblue <- Kdet %>% 
  filter(Lys_react_threshold == "Low" | Lys_react_threshold == "Medium" | Lys_react_threshold == "High") %>% 
  ggplot(aes(x=Lys_react_threshold, y=CADD_avg, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() +  
ggtitle("Average missense CADD score for detected Lysine codons") +
  ylab("CADD score") +
  xlab("Reactivity group") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))

Kredblue

ggsave("Kreact_avg_violin.png", width = 7, height=4, dpi= 300)
Kpan <- Kdet %>% 
  filter(Lys_target_label == "Target" | Lys_target_label == "panReactive") %>% 
  ggplot(aes(x=Lys_target_label, y=CADD_avg,fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Average missense CADD score for detected Lysine codons") +
  ylab("CADD score") +
  xlab("Target groups") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35))
Kpan

ggsave("Kpan_avg_violin.png", width = 7, height=4, dpi= 300)

CADD specific changes

merge version det vs not detected

caddCmerge <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_Cdet_not_merge.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(group = factor(group, levels = c(
    "Detected",
    "Other"
  )))

caddKmerge <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_Kdet_not_merge.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
  mutate(group = factor(group, levels = c(
    "Detected",
    "Other"
  )))
summary(caddCmerge)
##             pos_id19                 pos_id38           xref       
##  1_000949592_T_A:     2   1_001014212_T_A:     2   Q8WZ42 :  7182  
##  1_000949592_T_C:     2   1_001014212_T_C:     2   Q04721 :  3402  
##  1_000949592_T_G:     2   1_001014212_T_G:     2   P24043 :  2282  
##  1_000949593_G_A:     2   1_001014213_G_A:     2   Q5T4S7 :  1736  
##  1_000949593_G_C:     2   1_001014213_G_C:     2   Q15751 :  1694  
##  1_000949593_G_T:     2   1_001014213_G_T:     2   Q8WXH0 :  1442  
##  (Other)        :575100   (Other)        :575100   (Other):557374  
##  matched_target   lost_amino   gained_amino  Amino_acids    
##  False  :489012   Cys:575112   Arg: 82158   Cys/Arg: 82158  
##  C129   :   392                Gly: 82158   Cys/Gly: 82158  
##  C39    :   364                Phe: 82160   Cys/Phe: 82160  
##  C91    :   336                Ser:164318   Cys/Ser:164318  
##  C106   :   308                Trp: 82158   Cys/Trp: 82158  
##  C111   :   308                Tyr: 82160   Cys/Tyr: 82160  
##  (Other): 84392                                             
##  geneNamePrimary    CADD_score     assembly           group       
##  TTN    :  7182   Min.   : 0.001   hg19:287556   Detected: 86100  
##  NOTCH2 :  3402   1st Qu.:22.900   hg38:287556   Other   :489012  
##  LAMA2  :  2282   Median :25.500                                  
##  UBR4   :  1736   Mean   :24.905                                  
##  HERC1  :  1694   3rd Qu.:28.200                                  
##         :  1562   Max.   :59.000                                  
##  (Other):557254
summary(caddKmerge)
##             pos_id19                  pos_id38            xref        
##  1_000949382_A_C:      2   1_001014002_A_C:      2   Q8WZ42 :  41426  
##  1_000949382_A_G:      2   1_001014002_A_G:      2   P20929 :  11606  
##  1_000949383_A_C:      2   1_001014003_A_C:      2   Q09666 :  11578  
##  1_000949383_A_G:      2   1_001014003_A_G:      2   Q8WXH0 :   8344  
##  1_000949383_A_T:      2   1_001014003_A_T:      2   Q9UPN3 :   7630  
##  1_000949384_G_C:      2   1_001014004_G_C:      2   P46013 :   5236  
##  (Other)        :2156606   (Other)        :2156606   (Other):2070798  
##  matched_target    lost_amino    gained_amino  Amino_acids    
##  False  :2030590   Lys:2156618   Arg:308090   Lys/Arg:308090  
##  K52    :    602                 Asn:616176   Lys/Asn:616176  
##  K63    :    602                 Gln:308086   Lys/Gln:308086  
##  K58    :    588                 Glu:308086   Lys/Glu:308086  
##  K43    :    560                 Ile:138844   Lys/Ile:138844  
##  K50    :    560                 Met:169246   Lys/Met:169246  
##  (Other): 123116                 Thr:308090   Lys/Thr:308090  
##  geneNamePrimary     CADD_score     assembly            group        
##  TTN    :  41426   Min.   : 0.001   hg19:1078309   Detected: 126028  
##  NEB    :  11606   1st Qu.:22.600   hg38:1078309   Other   :2030590  
##  AHNAK  :  11578   Median :24.200                                    
##  SYNE2  :   8344   Mean   :24.112                                    
##  MACF1  :   7630   3rd Qu.:26.600                                    
##         :   5298   Max.   :57.000                                    
##  (Other):2070736
names(caddCmerge)
##  [1] "pos_id19"        "pos_id38"        "xref"           
##  [4] "matched_target"  "lost_amino"      "gained_amino"   
##  [7] "Amino_acids"     "geneNamePrimary" "CADD_score"     
## [10] "assembly"        "group"

hg19 Cys

Cm <- caddCmerge %>% 
filter(assembly == "hg19" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Detected vs other hg19 CADD scores for Cysteine") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60)) 

Cm

ggsave("CADDspec19_Cmerge_violin.png", width = 7, height=4, dpi= 300)

hg38 cys

Cm38 <- caddCmerge %>% 
filter(assembly == "hg38" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Detected vs other hg38 CADD scores for Cysteine") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60)) 

Cm38

ggsave("CADDspec38_Cmerge_violin.png", width = 7, height=4, dpi= 300)

LYSINE merge version

hg19 lysine

Km <- caddKmerge %>% 
filter(assembly == "hg19" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Detected vs other hg19 CADD scores for Lysine") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60)) 

Km

ggsave("CADDspec19_Kmerge_violin.png", width = 7, height=4, dpi= 300)

hg38 lysine

Km38 <- caddKmerge %>% 
filter(assembly == "hg38" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=group)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Detected vs other hg38 CADD scores for Lysine") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60)) 

Km38

ggsave("CADDspec38_Kmerge_violin.png", width = 7, height=4, dpi= 300)

CADD SPECIFIC GROUPS: REACTIVE AND PANTARGET

columns of cadd specific files (not merged)

  • col detected : pos_id19,pos_id38,xref,matched_target,lost_amino,gained_amino,Amino_acids,pos_ID,Cys_reactivity,Cys_react_threshold,Cys_target_label,geneNamePrimary,CADD_score,assembly

  • col not detected : pos_id19,pos_id38,xref,matched_target,lost_amino,gained_amino,Amino_acids,pos_ID_falseCKtarget,geneNamePrimary,CADD_score,assembly

# detected c and k 
cad <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_C_detected_43050x2rows.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
    mutate(Cys_react_threshold = factor(Cys_react_threshold, levels = c(
    "Low",
    "Medium",
    "High"
  ))) %>% 
  mutate(Cys_target_label = factor(Cys_target_label, levels = c(
    "Target",
    "panReactive"
  )))

kad <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_K_detected_63014x2rows.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) %>% 
    mutate(Lys_react_threshold = factor(Lys_react_threshold, levels = c(
    "Low",
    "Medium",
    "High"
  ))) %>% 
  mutate(Lys_target_label = factor(Lys_target_label, levels = c(
    "Target",
    "panReactive"
  )))

# not detected c and k
cadnot <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_C_notdetected_244506x2rows.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) 

kadnot <- read.csv("/Users/mariapalafox/Box\ Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/CADDspecific_K_notdetected_1015295x2rows.csv") %>% 
    mutate(assembly = factor(assembly, levels = c(
    "hg19",
    "hg38"
  ))) 
summary(cad)
##             pos_id19                pos_id38          xref      
##  1_000949592_T_A:    2   1_001014212_T_A:    2   P21333 :  350  
##  1_000949592_T_C:    2   1_001014212_T_C:    2   P49327 :  350  
##  1_000949592_T_G:    2   1_001014212_T_G:    2   Q15149 :  308  
##  1_000949593_G_A:    2   1_001014213_G_A:    2   P78527 :  280  
##  1_000949593_G_C:    2   1_001014213_G_C:    2   Q7Z6Z7 :  280  
##  1_000949593_G_T:    2   1_001014213_G_T:    2   O75369 :  252  
##  (Other)        :86088   (Other)        :86088   (Other):84280  
##  matched_target  lost_amino  gained_amino  Amino_acids   
##  C129   :  392   Cys:86100   Arg:12300    Cys/Arg:12300  
##  C39    :  364               Gly:12300    Cys/Gly:12300  
##  C91    :  336               Phe:12300    Cys/Phe:12300  
##  C106   :  308               Ser:24600    Cys/Ser:24600  
##  C111   :  308               Trp:12300    Cys/Trp:12300  
##  C100   :  294               Tyr:12300    Cys/Tyr:12300  
##  (Other):84098                                           
##          pos_ID      Cys_reactivity  Cys_react_threshold
##  A0AVT1_C347:   14   Min.   : 0.74   Low   :13006       
##  A0AVT1_C433:   14   1st Qu.: 4.32   Medium: 6174       
##  A0AVT1_C546:   14   Median : 5.58   High  : 1316       
##  A0AVT1_C625:   14   Mean   : 5.93   NA's  :65604       
##  A0AVT1_C721:   14   3rd Qu.: 6.83                      
##  A0AVT1_C770:   14   Max.   :20.00                      
##  (Other)    :86016   NA's   :65604                      
##     Cys_target_label geneNamePrimary   CADD_score     assembly    
##  Target     : 9968   FASN   :  350   Min.   : 0.001   hg19:43050  
##  panReactive:71750   FLNA   :  350   1st Qu.:22.200   hg38:43050  
##  NA's       : 4382   PLEC   :  308   Median :24.400               
##                      HUWE1  :  280   Mean   :23.911               
##                      PRKDC  :  280   3rd Qu.:27.500               
##                      DYNC1H1:  252   Max.   :59.000               
##                      (Other):84280
summary(kad)
##             pos_id19                 pos_id38           xref       
##  1_000949463_A_C:     2   1_001014083_A_C:     2   Q09666 :  1470  
##  1_000949463_A_G:     2   1_001014083_A_G:     2   P21333 :   700  
##  1_000949464_A_C:     2   1_001014084_A_C:     2   O75369 :   658  
##  1_000949464_A_G:     2   1_001014084_A_G:     2   P35579 :   588  
##  1_000949464_A_T:     2   1_001014084_A_T:     2   Q15149 :   504  
##  1_000949465_G_C:     2   1_001014085_G_C:     2   P00558 :   420  
##  (Other)        :126016   (Other)        :126016   (Other):121688  
##  matched_target   lost_amino   gained_amino  Amino_acids   
##  K52    :   602   Lys:126028   Arg:18004    Lys/Arg:18004  
##  K63    :   602                Asn:36008    Lys/Asn:36008  
##  K58    :   588                Gln:18004    Lys/Gln:18004  
##  K43    :   560                Glu:18004    Lys/Glu:18004  
##  K50    :   560                Ile: 7252    Lys/Ile: 7252  
##  K68    :   560                Met:10752    Lys/Met:10752  
##  (Other):122556                Thr:18004    Lys/Thr:18004  
##           pos_ID       Lys_reactivity  Lys_react_threshold
##  A0AVT1_K1014:    14   Min.   : 0.10   Low   :48818       
##  A0AVT1_K409 :    14   1st Qu.: 5.80   Medium: 9254       
##  A0AVT1_K544 :    14   Median :10.00   High  : 4256       
##  A0AVT1_K86  :    14   Mean   : 7.92   NA's  :63700       
##  A0AVT1_K963 :    14   3rd Qu.:10.00                      
##  A0JNW5_K645 :    14   Max.   :10.00                      
##  (Other)     :125944   NA's   :63700                      
##     Lys_target_label  geneNamePrimary    CADD_score     assembly    
##  Target     :  1610   AHNAK  :  1470   Min.   : 0.001   hg19:63014  
##  panReactive:111468   FLNA   :   700   1st Qu.:23.100   hg38:63014  
##  NA's       : 12950   FLNB   :   658   Median :24.800               
##                       MYH9   :   588   Mean   :25.234               
##                       PLEC   :   504   3rd Qu.:27.600               
##                       PGK1   :   420   Max.   :40.000               
##                       (Other):121688
names(cad)
##  [1] "pos_id19"            "pos_id38"            "xref"               
##  [4] "matched_target"      "lost_amino"          "gained_amino"       
##  [7] "Amino_acids"         "pos_ID"              "Cys_reactivity"     
## [10] "Cys_react_threshold" "Cys_target_label"    "geneNamePrimary"    
## [13] "CADD_score"          "assembly"
names(kad)
##  [1] "pos_id19"            "pos_id38"            "xref"               
##  [4] "matched_target"      "lost_amino"          "gained_amino"       
##  [7] "Amino_acids"         "pos_ID"              "Lys_reactivity"     
## [10] "Lys_react_threshold" "Lys_target_label"    "geneNamePrimary"    
## [13] "CADD_score"          "assembly"

cys high

CChigh <- cad %>% 
filter(Cys_react_threshold == "High" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for highly reactive Cysteines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35)) 

CChigh

ggsave("CADDmiss_CHIGH_violin.png", width = 7, height=4, dpi= 300)

cys medium

CCmed <- cad %>% 
filter(Cys_react_threshold == "Medium" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for medium reactive Cysteines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35)) 

CCmed

ggsave("CADDmiss_CMEDIUM_violin.png", width = 7, height=4, dpi= 300)

cys low

CClow <- cad %>% 
filter(Cys_react_threshold == "Low" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for low reactive Cysteines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60,70)) 

CClow

ggsave("CADDmiss_CLOW_violin.png", width = 7, height=4, dpi= 300)

lysine versino of LOW MEDIUM HIGH plots

lysine high

KKhigh <- kad %>% 
filter(Lys_react_threshold == "High" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for highly reactive Lysines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35,40)) 

KKhigh

ggsave("CADDmiss_KHIGH_violin.png", width = 7, height=4, dpi= 300)

lysine medium

KKmed <- kad %>% 
filter(Lys_react_threshold == "Medium" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for medium reactive Lysines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35)) 

KKmed

ggsave("CADDmiss_KMEDIUM_violin.png", width = 7, height=4, dpi= 300)

lysine low

KKlow <- kad %>% 
filter(Lys_react_threshold == "Low" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for low reactive Lysines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35,40)) 

KKlow

ggsave("CADDmiss_KLOW_violin.png", width = 7, height=4, dpi= 300)

PANREACTIVE VERSION of above plots

CYS pan

CCpan <- cad %>% 
filter(Cys_target_label == "panReactive" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for panReactive Cysteines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,10,20,30,40,50,60,70)) 

CCpan

ggsave("CADDmiss_CPAN_violin.png", width = 7, height=4, dpi= 300)

cys target

CCtarget <- cad %>% 
filter(Cys_target_label == "Target" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for target Cysteines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35)) 

CCtarget

ggsave("CADDmiss_CTARGET_violin.png", width = 7, height=4, dpi= 300)

lysine pan

KKpan <- kad %>% 
filter(Lys_target_label == "panReactive" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for panReactive Lysines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35,40)) 

KKpan

ggsave("CADDmiss_KPAN_violin.png", width = 7, height=4, dpi= 300)

lysine target

KKtarget <- kad %>% 
filter(Lys_target_label == "Target" ) %>% 
  ggplot(aes(x=Amino_acids, y=CADD_score, fill=assembly)) + geom_violin(alpha=1/3, trim=TRUE, draw_quantiles = c(0.25, 0.5, 0.75)) +  theme_classic() + scale_fill_grey() + 
ggtitle("Missense CADD scores for target Lysines") +
  ylab("CADD score") +
  xlab("Missense amino acids") + 
  theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=15, color="black", margin=margin(t=15, b=5)),axis.title.y = element_text(size=15, color="black", margin=margin(t=0, r=15, b=0)), axis.text=element_text(size=12, color="black",margin=margin(t=15, b=5))) +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35)) 

KKtarget

ggsave("CADDmiss_KTARGET_violin.png", width = 7, height=4, dpi= 300)